#!/usr/bin/env python
# encoding: utf-8
import requests
from bs4 import BeautifulSoup

from console_pipeline import ConsolePipeline
from file_writer_pipeline import FileWriterPipeline
from spider import Spider


class IPProxySpider(object):
    """
    Spider that scrapes proxy servers from xicidaili, validates each one by
    issuing a test request through it, and yields the working proxies so they
    can be written to a file from which other spiders draw random proxy IPs.
    """
    start_url = []
    headers = None

    def __init__(self):
        # start_url is a generator of listing-page URLs to crawl.
        self.start_url = self.__generate_url()
        self.content = ""

    def __generate_url(self):
        # Only the first listing page is crawled for now.
        yield "http://www.xicidaili.com/nn/1"

    def process(self, html):
        """Parse the proxy table in *html* and yield one item per row.

        Yields ``{"content": "<schema>://<ip>:<port>\\n"}`` for each proxy
        that answers within the timeout, or ``None`` for dead/slow ones.
        """
        soup = BeautifulSoup(html, 'lxml')
        print(soup.head.title.text)
        rows = soup.find(id="ip_list").find_all("tr")[1:]  # [1:] skips the header row
        print("count:", len(rows))
        for row in rows:
            cells = row.find_all("td")  # hoisted: was queried four times per row
            ip = cells[1].text
            port = cells[2].text
            schema = cells[5].text
            print("**********", ip)
            content = "%s://%s:%s" % (schema.lower(), ip, port)
            if schema == "HTTP":
                proxies = {'http': content}
            else:
                proxies = {'https': content}
            # BUG FIX: the item must be reset every iteration. Previously a
            # timeout re-yielded the previous row's item (or an empty dict on
            # the first row), and a non-200 response set items=None, which
            # crashed the next iteration's items["content"] assignment.
            item = None
            try:
                # timeout=1: a proxy slower than 1 second is treated as dead
                response = requests.get(url="http://www.example.com/",
                                        proxies=proxies, timeout=1)
                print(response.elapsed.total_seconds())
                if response.status_code == 200:
                    item = {"content": content + "\n"}
            except requests.RequestException:
                # Narrowed from bare Exception: only network/timeout errors
                # mean "bad proxy"; programming errors should propagate.
                print("time out...")
            yield item



if __name__ == "__main__":
    thread_options = {"multiplethread": False, "queueTimeOut": 2}
    Spider(IPProxySpider(), threadoptions=thread_options) \
        .addPipeline(ConsolePipeline()) \
        .addPipeline(FileWriterPipeline(filename="proxy_ip.txt")) \
        .start()
